{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "2dd664d1",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import functools\n",
    "import os\n",
    "\n",
    "# Hide all CUDA devices so the notebook runs on CPU; set before TensorFlow is imported.\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = ''\n",
    "\n",
    "import tensorflow as tf\n",
    "import tensorflow_datasets as tfds\n",
    "import t5\n",
    "from t5 import models  # loads the t5.models submodule used later as t5.models.MtfModel\n",
    "\n",
    "# SentencePiece model file used as the vocabulary; uncomment to download it.\n",
    "# !wget https://f000.backblazeb2.com/file/malaya-model/bpe/sp10m.cased.ms-en.model\n",
    "vocab = 'sp10m.cased.ms-en.model'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "47fb7029",
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip3 install t5==0.9.3 --no-deps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "bb18a0a5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "albert-tensorflow==1.1\r\n",
      "bert-tensorflow==1.0.1\r\n",
      "mesh-tensorflow==0.1.21\r\n",
      "nvidia-tensorflow==1.15.5+nv22.5\r\n",
      "rotary-embedding-tensorflow==0.1.1\r\n",
      "tensorflow-addons==0.12.0\r\n",
      "tensorflow-datasets==4.6.0\r\n",
      "tensorflow-estimator==2.6.0\r\n",
      "tensorflow-gpu==2.6.0\r\n",
      "tensorflow-hub==0.12.0\r\n",
      "tensorflow-io-gcs-filesystem==0.24.0\r\n",
      "tensorflow-metadata==1.7.0\r\n",
      "tensorflow-model-analysis==0.38.0\r\n",
      "tensorflow-probability==0.13.0\r\n",
      "tensorflow-serving-api==2.8.0\r\n",
      "tensorflow-text==2.6.0\r\n"
     ]
    }
   ],
   "source": [
    "!pip3 freeze | grep 'tensorflow'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "2b6e3939",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "keras==2.6.0\r\n"
     ]
    }
   ],
   "source": [
    "!pip3 freeze | grep 'keras'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "237f0e54",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write a one-row, tab-separated fixture file: English source, then Malay target.\n",
    "# Descriptive names avoid the ambiguous `l` and keep `r` free for the\n",
    "# dataset iterable created further down.\n",
    "source_text = 'i like u'\n",
    "target_text = 'saya suka awak'\n",
    "with tf.io.gfile.GFile('test.tsv', 'w') as outfile:\n",
    "    outfile.write(f'{source_text}\\t{target_text}\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "f7427532",
   "metadata": {},
   "outputs": [],
   "source": [
    "def translation_dataset(split, shuffle_files=False):\n",
    "    \"\"\"Read the local 'test.tsv' file as a dataset of question/answer dicts.\n",
    "\n",
    "    Neither argument changes the result: `split` is never read and\n",
    "    `shuffle_files` is discarded right away.\n",
    "\n",
    "    Each line is split on tabs into two string fields, which are returned\n",
    "    under the keys 'question' (first field) and 'answer' (second field).\n",
    "    \"\"\"\n",
    "    del shuffle_files\n",
    "\n",
    "    # Tab-delimited parsing with quote handling switched off; both\n",
    "    # columns default to the empty string.\n",
    "    parse_line = functools.partial(\n",
    "        tf.io.decode_csv,\n",
    "        record_defaults=['', ''],\n",
    "        field_delim='\\t',\n",
    "        use_quote_delim=False,\n",
    "    )\n",
    "\n",
    "    def name_columns(*columns):\n",
    "        return dict(zip(['question', 'answer'], columns))\n",
    "\n",
    "    lines = tf.data.TextLineDataset(['test.tsv'])\n",
    "    parsed = lines.map(\n",
    "        parse_line,\n",
    "        num_parallel_calls=tf.data.experimental.AUTOTUNE,\n",
    "    )\n",
    "    return parsed.map(name_columns)\n",
    "\n",
    "\n",
    "def translation_preprocessor(ds):\n",
    "    \"\"\"Convert question/answer examples into 'inputs'/'targets' text pairs.\n",
    "\n",
    "    'inputs' is the prefix 'terjemah Inggeris ke Melayu: ' joined with the\n",
    "    example's 'question' string; 'targets' is its 'answer' string unchanged.\n",
    "    \"\"\"\n",
    "    prefix = 'terjemah Inggeris ke Melayu: '\n",
    "\n",
    "    def add_task_prefix(example):\n",
    "        prompt = tf.strings.join([prefix, example['question']])\n",
    "        return dict(inputs=prompt, targets=example['answer'])\n",
    "\n",
    "    return ds.map(\n",
    "        add_task_prefix,\n",
    "        num_parallel_calls=tf.data.experimental.AUTOTUNE,\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "87d9412f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import seqio\n",
    "\n",
    "# Vocabulary defaults: the SentencePiece model path comes from `vocab`,\n",
    "# and 100 extra ids are requested when the vocabulary is built.\n",
    "DEFAULT_EXTRA_IDS = 100\n",
    "DEFAULT_SPM_PATH = vocab\n",
    "\n",
    "\n",
    "def get_default_vocabulary():\n",
    "    \"\"\"Return a new seqio SentencePieceVocabulary built from the defaults above.\"\"\"\n",
    "    return seqio.SentencePieceVocabulary(\n",
    "        DEFAULT_SPM_PATH,\n",
    "        extra_ids=DEFAULT_EXTRA_IDS,\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "f5d396ae",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<seqio.dataset_providers.Mixture at 0x7f414172f5b0>"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Remove any earlier 'translation_dataset' entry, then register the task again.\n",
    "t5.data.TaskRegistry.remove('translation_dataset')\n",
    "t5.data.TaskRegistry.add(\n",
    "    'translation_dataset',\n",
    "    splits=['train'],\n",
    "    dataset_fn=translation_dataset,\n",
    "    text_preprocessor=[translation_preprocessor],\n",
    "    output_features=seqio.Feature(get_default_vocabulary()),\n",
    "    postprocess_fn=t5.data.postprocessors.lower_text,\n",
    "    metric_fns=[t5.evaluation.metrics.accuracy],\n",
    ")\n",
    "\n",
    "# Same remove-then-add pattern for the mixture, which lists only that task.\n",
    "t5.data.MixtureRegistry.remove('translation_bahasa')\n",
    "t5.data.MixtureRegistry.add(\n",
    "    'translation_bahasa',\n",
    "    ['translation_dataset'],\n",
    "    default_rate=1.0,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "13d7473a",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-07-05 14:05:23.312228: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected\n",
      "2022-07-05 14:05:23.312255: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: huseincomel-desktop\n",
      "2022-07-05 14:05:23.312258: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: huseincomel-desktop\n",
      "2022-07-05 14:05:23.312301: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: Not found: was unable to find libcuda.so DSO loaded into this program\n",
      "2022-07-05 14:05:23.312319: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 470.129.6\n",
      "2022-07-05 14:05:23.312488: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA\n",
      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
     ]
    }
   ],
   "source": [
    "# Build the preprocessed dataset for the registered task as a smoke test.\n",
    "# The task declares only a 'train' split, so request that split by name\n",
    "# (translation_dataset never reads `split`, so the data is unchanged).\n",
    "nq_task = t5.data.TaskRegistry.get('translation_dataset')\n",
    "ds = nq_task.get_dataset(\n",
    "    split='train',\n",
    "    sequence_length={'inputs': 512, 'targets': 512},\n",
    ")\n",
    "# Wrap the dataset so its elements come back as NumPy values.\n",
    "r = tfds.as_numpy(ds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "cdb652c3",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-07-05 14:05:23.797967: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'inputs_pretokenized': b'terjemah Inggeris ke Melayu: i like u',\n",
       " 'inputs': array([   13, 26087,  2040,    55,  1550,    31,    13,    91,   164,\n",
       "           13,   354,     1], dtype=int32),\n",
       " 'targets_pretokenized': b'saya suka awak',\n",
       " 'targets': array([   67,  1259, 12367,     1], dtype=int32)}"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the first example; iterate through the public iterator protocol\n",
    "# rather than the private `_make_iterator_fn` attribute.\n",
    "next(iter(r))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "a5016958",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Training configuration.\n",
    "model_parallelism = 1\n",
    "train_batch_size = 2\n",
    "keep_checkpoint_max = 5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "31d5ca4e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Directory passed to the model as `model_dir`.\n",
    "BASE_DIR = 'out'\n",
    "\n",
    "model = t5.models.MtfModel(\n",
    "    model_dir=BASE_DIR,\n",
    "    tpu=None,\n",
    "    tpu_topology=None,\n",
    "    model_parallelism=model_parallelism,\n",
    "    batch_size=train_batch_size,\n",
    "    sequence_length={'inputs': 512, 'targets': 512},\n",
    "    learning_rate_schedule=0.0005,\n",
    "    save_checkpoints_steps=5,\n",
    "    # Use the configured value instead of repeating the literal 5 here.\n",
    "    keep_checkpoint_max=keep_checkpoint_max,\n",
    "    iterations_per_loop=100,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "55a0f7ba",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "checkpoint\t\t\t\tmodel.ckpt-1000000.index\r\n",
      "model.ckpt-1000000.data-00000-of-00002\tmodel.ckpt-1000000.meta\r\n",
      "model.ckpt-1000000.data-00001-of-00002\toperative_config.gin\r\n"
     ]
    }
   ],
   "source": [
    "# !wget https://f000.backblazeb2.com/file/malaya-model/pretrained/t5-tiny-social-media-2021-11-15.tar.gz\n",
    "# !tar -zxf t5-tiny-social-media-2021-11-15.tar.gz\n",
    "!ls t5-tiny-social-media"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "fe312649",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Directory that holds the pretrained checkpoint files.\n",
    "MODEL_DIR = 't5-tiny-social-media'\n",
    "# Value passed to model.finetune as `finetune_steps`.\n",
    "FINETUNE_STEPS = 50_000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "80a1b0b1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'t5-tiny-social-media/model.ckpt-1000000'"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rewrite the checkpoint index inside MODEL_DIR so it names the bundled\n",
    "# checkpoint by a relative path, then check that TensorFlow resolves it.\n",
    "# MODEL_DIR is used for both steps so they cannot drift apart.\n",
    "with open(os.path.join(MODEL_DIR, 'checkpoint'), 'w') as checkpoint_file:\n",
    "    checkpoint_file.write('model_checkpoint_path: \"model.ckpt-1000000\"')\n",
    "\n",
    "tf.train.latest_checkpoint(MODEL_DIR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7811234b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Using config: {'_model_dir': 'out', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 5, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true\n",
      "graph_options {\n",
      "  rewrite_options {\n",
      "    meta_optimizer_iterations: ONE\n",
      "  }\n",
      "}\n",
      ", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=100, num_shards=None, num_cores_per_replica=1, per_host_input_for_training=4, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None, eval_training_input_configuration=2, experimental_host_call_every_n_steps=1, experimental_allow_per_host_v2_parallel_get_next=False, experimental_feed_hook=None), '_cluster': None}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Using config: {'_model_dir': 'out', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 5, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true\n",
      "graph_options {\n",
      "  rewrite_options {\n",
      "    meta_optimizer_iterations: ONE\n",
      "  }\n",
      "}\n",
      ", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=100, num_shards=None, num_cores_per_replica=1, per_host_input_for_training=4, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None, eval_training_input_configuration=2, experimental_host_call_every_n_steps=1, experimental_allow_per_host_v2_parallel_get_next=False, experimental_feed_hook=None), '_cluster': None}\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:_TPUContext: eval_on_tpu True\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:_TPUContext: eval_on_tpu True\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:eval_on_tpu ignored because use_tpu is False.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:eval_on_tpu ignored because use_tpu is False.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /home/ubuntu/.local/lib/python3.8/site-packages/tensorflow/python/training/training_util.py:235: Variable.initialized_value (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /home/ubuntu/.local/lib/python3.8/site-packages/tensorflow/python/training/training_util.py:235: Variable.initialized_value (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.\n"
     ]
    }
   ],
   "source": [
    "# Fine-tune on the registered 'translation_dataset' task, starting from the\n",
    "# pretrained checkpoint directory MODEL_DIR, for FINETUNE_STEPS steps.\n",
    "model.finetune(\n",
    "    mixture_or_task_name='translation_dataset',\n",
    "    finetune_steps=FINETUNE_STEPS,\n",
    "    pretrained_model_dir=MODEL_DIR,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b08b71b9",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
